필요 라이브러리 설치

In [ ]:
pip install scorecardpy
In [ ]:
pip install lightgbm

사전 환경 설정

  • ./input 폴더에 학습 데이터 저장

Library Import

In [120]:
# Library Import
import os
import gc
import random
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import missingno as msno
import plotly.express as px
import warnings
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, Lasso
import scorecardpy as sc

from plotly.offline import init_notebook_mode, plot
init_notebook_mode()

# igore warning message
warnings.filterwarnings("ignore")

# pandas display option setting
pd.set_option('display.max_columns', None)
%matplotlib inline

1. Data Load

File List Check

In [2]:
# File List Check
print(os.listdir("./input"))
['application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'HomeCredit_columns_description.csv', 'installments_payments.csv', 'POS_CASH_balance.csv', 'previous_application.csv', 'sample_submission.csv', 'submission.csv']

Data Load

In [3]:
app_test = pd.read_csv('./input/application_test.csv')
app_train = pd.read_csv('./input/application_train.csv')
bureau = pd.read_csv('./input/bureau.csv')
bureau_bal = pd.read_csv('./input/bureau_balance.csv')
cc_bal = pd.read_csv('./input/credit_card_balance.csv')
cash_bal = pd.read_csv('./input/POS_CASH_balance.csv')
pre_app = pd.read_csv('./input/previous_application.csv')
ins_pay = pd.read_csv('./input/installments_payments.csv')
In [4]:
# Data Shape Check
print (app_test.shape, app_train.shape, bureau.shape, bureau_bal.shape, cc_bal.shape, cash_bal.shape, pre_app.shape, ins_pay.shape)
(48744, 121) (307511, 122) (1716428, 17) (27299925, 3) (3840312, 23) (10001358, 8) (1670214, 37) (13605401, 8)

2.1. Target(Default) EDA(생략-Target에 대한 정의 및 데이터가 없기 때문에 수행 불가)

  • 전이율 분석(roll rate 분석)
    • 30일 연체 -> 60일 연체 -> 90일 연체로 넘어가는 비율 분석
  • Performance Period 분석
    • 9개월 -> 12개월 -> 15개월 -> 18개월 -> 24개월 기간의 Default의 증가 추이 분석
In [8]:
# Train / Test Data Merge
app_train['DataCategory'] = 'TRAIN'
app_test['DataCategory'] = 'TEST'
all_app = pd.concat([app_train, app_test], axis=0)
print (all_app.shape)
(356255, 123)
In [9]:
"""
# Category Type Column Name
"""
cat_cols = [col for col in all_app.select_dtypes(include='object').columns if col not in ['DataCategory']]
print (cat_cols)
['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
In [10]:
print ('### Category Column Unique Values ###')
print ('-'*100)
for col in cat_cols:
    print ('%s : %s'%(col, list(all_app[col].unique())))
### Category Column Unique Values ###
----------------------------------------------------------------------------------------------------
NAME_CONTRACT_TYPE : ['Cash loans', 'Revolving loans']
CODE_GENDER : ['M', 'F', 'XNA']
FLAG_OWN_CAR : ['N', 'Y']
FLAG_OWN_REALTY : ['Y', 'N']
NAME_TYPE_SUITE : ['Unaccompanied', 'Family', 'Spouse, partner', 'Children', 'Other_A', nan, 'Other_B', 'Group of people']
NAME_INCOME_TYPE : ['Working', 'State servant', 'Commercial associate', 'Pensioner', 'Unemployed', 'Student', 'Businessman', 'Maternity leave']
NAME_EDUCATION_TYPE : ['Secondary / secondary special', 'Higher education', 'Incomplete higher', 'Lower secondary', 'Academic degree']
NAME_FAMILY_STATUS : ['Single / not married', 'Married', 'Civil marriage', 'Widow', 'Separated', 'Unknown']
NAME_HOUSING_TYPE : ['House / apartment', 'Rented apartment', 'With parents', 'Municipal apartment', 'Office apartment', 'Co-op apartment']
OCCUPATION_TYPE : ['Laborers', 'Core staff', 'Accountants', 'Managers', nan, 'Drivers', 'Sales staff', 'Cleaning staff', 'Cooking staff', 'Private service staff', 'Medicine staff', 'Security staff', 'High skill tech staff', 'Waiters/barmen staff', 'Low-skill Laborers', 'Realty agents', 'Secretaries', 'IT staff', 'HR staff']
WEEKDAY_APPR_PROCESS_START : ['WEDNESDAY', 'MONDAY', 'THURSDAY', 'SUNDAY', 'SATURDAY', 'FRIDAY', 'TUESDAY']
ORGANIZATION_TYPE : ['Business Entity Type 3', 'School', 'Government', 'Religion', 'Other', 'XNA', 'Electricity', 'Medicine', 'Business Entity Type 2', 'Self-employed', 'Transport: type 2', 'Construction', 'Housing', 'Kindergarten', 'Trade: type 7', 'Industry: type 11', 'Military', 'Services', 'Security Ministries', 'Transport: type 4', 'Industry: type 1', 'Emergency', 'Security', 'Trade: type 2', 'University', 'Transport: type 3', 'Police', 'Business Entity Type 1', 'Postal', 'Industry: type 4', 'Agriculture', 'Restaurant', 'Culture', 'Hotel', 'Industry: type 7', 'Trade: type 3', 'Industry: type 3', 'Bank', 'Industry: type 9', 'Insurance', 'Trade: type 6', 'Industry: type 2', 'Transport: type 1', 'Industry: type 12', 'Mobile', 'Trade: type 1', 'Industry: type 5', 'Industry: type 10', 'Legal Services', 'Advertising', 'Trade: type 5', 'Cleaning', 'Industry: type 13', 'Trade: type 4', 'Telecom', 'Industry: type 8', 'Realtor', 'Industry: type 6']
FONDKAPREMONT_MODE : ['reg oper account', nan, 'org spec account', 'reg oper spec account', 'not specified']
HOUSETYPE_MODE : ['block of flats', nan, 'terraced house', 'specific housing']
WALLSMATERIAL_MODE : ['Stone, brick', 'Block', nan, 'Panel', 'Mixed', 'Wooden', 'Others', 'Monolithic']
EMERGENCYSTATE_MODE : ['No', nan, 'Yes']
In [11]:
"""
# 데이터 요약 정보 확인
"""
all_app.describe()
Out[11]:
SK_ID_CURR TARGET CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI TOTALAREA_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
count 356255.000000 307511.000000 356255.000000 3.562550e+05 3.562550e+05 356219.000000 3.559770e+05 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 121014.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356253.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 162345.000000 3.555870e+05 286622.000000 176307.000000 148671.000000 183392.000000 119949.000000 107895.000000 167175.000000 177848.000000 179914.000000 115147.000000 145411.000000 113276.000000 178353.000000 109394.000000 160489.000000 176307.000000 148671.000000 183392.000000 119949.000000 107895.000000 167175.000000 177848.000000 179914.000000 115147.000000 145411.000000 113276.000000 178353.000000 109394.000000 160489.000000 176307.000000 148671.000000 183392.000000 119949.000000 107895.000000 167175.000000 177848.000000 179914.000000 115147.000000 145411.000000 113276.000000 178353.000000 109394.000000 160489.000000 185200.000000 355205.000000 355205.000000 355205.000000 355205.000000 356254.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.000000 356255.00000 356255.000000 356255.000000 356255.000000 356255.000000 308687.000000 308687.000000 308687.000000 308687.000000 308687.000000 308687.000000
mean 278128.000000 0.080729 0.414316 1.701161e+05 5.877674e+05 27425.560657 5.280200e+05 0.020917 -16041.248841 64317.231413 -4983.593527 -3002.071163 12.023741 0.999994 0.818498 0.200098 0.998170 0.278612 0.071213 2.151858 2.050506 2.028932 12.055749 0.015649 0.051371 0.040847 0.078076 0.229661 0.178824 0.501965 5.148900e-01 0.509350 0.118138 0.088673 0.977889 0.752283 0.045045 0.079819 0.150015 0.227331 0.232817 0.066454 0.101495 0.108089 0.008868 0.028503 0.114914 0.087750 0.977239 0.759452 0.042930 0.075346 0.145471 0.223315 0.228878 0.065092 0.106382 0.106641 0.008116 0.027183 0.118549 0.088178 0.977903 0.755548 0.044994 0.078930 0.149494 0.226922 0.232504 0.067296 0.102674 0.109279 0.008697 0.028386 0.103193 1.425729 0.143452 1.409468 0.100198 -978.580852 0.000036 0.720504 0.000084 0.015065 0.087976 0.000171 0.082346 0.003977 0.000020 0.003537 0.000006 0.003043 0.002535 0.001044 0.008570 0.00023 0.007231 0.000514 0.000438 0.000289 0.005808 0.006281 0.029995 0.231697 0.304399 1.911564
std 102842.104413 0.272419 0.720378 2.235068e+05 3.986237e+05 14732.808190 3.660650e+05 0.013915 4358.803980 141705.532576 3526.968986 1517.901735 11.880848 0.002369 0.385434 0.400074 0.042741 0.448317 0.257181 0.907937 0.510947 0.504586 3.267576 0.124113 0.220753 0.197936 0.268292 0.420616 0.383206 0.210045 1.897531e-01 0.194141 0.108954 0.082312 0.057929 0.113267 0.077045 0.135249 0.100139 0.145051 0.161909 0.081287 0.093418 0.111194 0.047876 0.069880 0.108745 0.084076 0.063165 0.110112 0.075437 0.133025 0.101088 0.144126 0.161725 0.081911 0.098779 0.112555 0.046330 0.070723 0.109824 0.082017 0.058562 0.112057 0.077140 0.135133 0.100450 0.145453 0.162419 0.082267 0.094541 0.112881 0.047519 0.070574 0.108041 2.599914 0.456579 2.577724 0.368259 835.063902 0.006041 0.448752 0.009176 0.121812 0.283261 0.013084 0.274891 0.062942 0.004433 0.059366 0.002369 0.055077 0.050282 0.032297 0.092175 0.01517 0.084726 0.022659 0.020921 0.017001 0.079736 0.104250 0.191374 0.855949 0.786915 1.865338
min 100001.000000 0.000000 0.000000 2.565000e+04 4.500000e+04 1615.500000 4.050000e+04 0.000253 -25229.000000 -17912.000000 -24672.000000 -7197.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000 -1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.013458 8.173617e-08 0.000527 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -4361.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 189064.500000 0.000000 0.000000 1.125000e+05 2.700000e+05 16731.000000 2.340000e+05 0.010006 -19676.000000 -2781.000000 -7477.000000 -4318.000000 5.000000 1.000000 1.000000 0.000000 1.000000 0.000000 0.000000 2.000000 2.000000 2.000000 10.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.335503 3.949551e-01 0.368969 0.057700 0.044500 0.976700 0.687200 0.007900 0.000000 0.069000 0.166700 0.083300 0.018700 0.050400 0.045800 0.000000 0.000000 0.052500 0.041000 0.976700 0.699400 0.007300 0.000000 0.069000 0.166700 0.083300 0.016600 0.054200 0.043100 0.000000 0.000000 0.058300 0.044100 0.976700 0.691400 0.007900 0.000000 0.069000 0.166700 0.083300 0.018800 0.051300 0.046200 0.000000 0.000000 0.041500 0.000000 0.000000 0.000000 0.000000 -1592.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 278128.000000 0.000000 0.000000 1.530000e+05 5.002110e+05 25078.500000 4.500000e+05 0.018850 -15755.000000 -1224.000000 -4502.000000 -3252.000000 9.000000 1.000000 1.000000 0.000000 1.000000 0.000000 0.000000 2.000000 2.000000 2.000000 12.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.506155 5.648491e-01 0.533482 0.088000 0.076500 0.981600 0.755200 0.021300 0.000000 0.137900 0.166700 0.208300 0.048200 0.075600 0.074900 0.000000 0.003600 0.084000 0.074900 0.981600 0.764800 0.019200 0.000000 0.137900 0.166700 0.208300 0.045900 0.077100 0.073300 0.000000 0.001100 0.087400 0.076100 0.981600 0.758500 0.021000 0.000000 0.137900 0.166700 0.208300 0.048700 0.077000 0.075400 0.000000 0.003100 0.069000 0.000000 0.000000 0.000000 0.000000 -771.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
75% 367191.500000 0.000000 1.000000 2.025000e+05 7.975575e+05 34960.500000 6.750000e+05 0.028663 -12425.000000 -290.000000 -1995.000000 -1717.000000 15.000000 1.000000 1.000000 0.000000 1.000000 1.000000 0.000000 3.000000 2.000000 2.000000 14.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.673344 6.629285e-01 0.665855 0.148500 0.112300 0.986600 0.823200 0.051900 0.120000 0.206900 0.333300 0.375000 0.085800 0.121000 0.131000 0.003900 0.027800 0.146000 0.112700 0.986600 0.823600 0.049300 0.120800 0.206900 0.333300 0.375000 0.084300 0.131300 0.125800 0.003900 0.023200 0.149400 0.111800 0.986600 0.825600 0.051800 0.120000 0.206900 0.333300 0.375000 0.087000 0.123100 0.131200 0.003900 0.026800 0.128700 2.000000 0.000000 2.000000 0.000000 -286.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.000000
max 456255.000000 1.000000 20.000000 1.170000e+08 4.050000e+06 258025.500000 4.050000e+06 0.072508 -7338.000000 365243.000000 0.000000 0.000000 91.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 21.000000 3.000000 3.000000 23.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.962693 8.549997e-01 0.896010 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 354.000000 34.000000 351.000000 24.000000 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.00000 1.000000 1.000000 1.000000 1.000000 4.000000 9.000000 8.000000 27.000000 261.000000 25.000000
In [13]:
all_app[all_app['DAYS_EMPLOYED'] >0]['DAYS_EMPLOYED'].describe()
Out[13]:
count     64648.0
mean     365243.0
std           0.0
min      365243.0
25%      365243.0
50%      365243.0
75%      365243.0
max      365243.0
Name: DAYS_EMPLOYED, dtype: float64
  • Application data의 DAYS_EMPLOYED의 경우 대출 실행일 이전에 고용일수를 나타내는 항목인데 365243 양수 값이 존재함
  • 64648건의 양수 값은 별도의 오류 데이터로 추정되어 nan값으로 변환
In [14]:
all_app['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)

2.2.2. Feature Missing Value Check

  • Application Data의 경우 120개 변수 중 67개 변수에 Missing Value가 존재
In [15]:
# Missing Value 존재하는 컬럼
missing_cols = app_train.columns[app_train.isnull().any()].tolist()
print ('## Missing Value # of Columns : %s'%len(missing_cols))
print (missing_cols)
## Missing Value # of Columns : 67
['AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'OWN_CAR_AGE', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']
  • COMMONAREA_MODE 컬럼의 경우 약 70%가 Missing 값으로 구성되어 있음(최대 Missing 컬럼)
In [16]:
app_train[missing_cols].isnull().sum().reset_index().rename(columns={0:'missing_cnt'}).sort_values('missing_cnt', ascending=False)[:1]
Out[16]:
index missing_cnt
41 COMMONAREA_MEDI 214865
In [18]:
"""
# Missing Value Count Check
"""
msno.bar(app_train[missing_cols], figsize=(16, 5), color='#34495e',labels=True, fontsize=8)
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x2400ebf2548>
In [20]:
"""
# Data Completeness Check
"""
msno.matrix(app_train[missing_cols], figsize=(14, 5), labels=True, fontsize=8)
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x2403a7a3e08>
  • ata Completeness : 최대 67개, 최소 6개의 컬럼의 값이 존재(61개 컬럼 Null)하는 Row로 구성되어 있음
In [121]:
default_agg = {
    'TARGET' : ['sum', 'count']
}
In [122]:
cont_type_agg = app_train.groupby(['NAME_CONTRACT_TYPE']).agg({**default_agg}).reset_index()
cont_type_agg.columns = ['_'.join(col).strip() for col in cont_type_agg.columns.values]
cont_type_agg['TARGET_ratio'] = cont_type_agg['TARGET_sum']/cont_type_agg['TARGET_count'] * 100
cont_type_agg['CRatio'] = cont_type_agg['TARGET_count']/cont_type_agg['TARGET_count'].sum() * 100
cont_type_agg.head()
Out[122]:
NAME_CONTRACT_TYPE_ TARGET_sum TARGET_count TARGET_ratio CRatio
0 Cash loans 23221 278232 8.345913 90.478715
1 Revolving loans 1604 29279 5.478329 9.521285
In [123]:
def_ratio = list((cont_type_agg['TARGET_ratio'].astype(str).values))
loan_type = list(cont_type_agg['NAME_CONTRACT_TYPE_'].values)

labels = []
for (item1, item2) in zip(loan_type, def_ratio):
    labels.append(item1 + '(Default Ratio:' + item2[:3] + '%)')
In [127]:
fig = px.pie(cont_type_agg, values='CRatio', names=labels, title='Product Component Ratio')
fig.show()
  • Revolving Loan 상품(부도율 5.4%)의 경우 최소 약정비율만 상환하면 자동으로 이월 되기 때문에 Cash Loan 상품(부도율 8.3%) 대비 부도율이 낮게 나타나고 있음
In [100]:
col_name = 'CODE_GENDER'
agg_df = app_train.groupby([col_name]).agg({**default_agg}).reset_index()
agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
agg_df['TARGET_ratio'] = np.round(agg_df['TARGET_sum']/agg_df['TARGET_count'] * 100, 2)
agg_df['CRatio'] = np.round(agg_df['TARGET_count']/agg_df['TARGET_count'].sum() * 100, 4)
agg_df.head()
Out[100]:
CODE_GENDER_ TARGET_sum TARGET_count TARGET_ratio CRatio
0 F 14170 202448 7.00 65.8344
1 M 10655 105059 10.14 34.1643
2 XNA 0 4 0.00 0.0013
  • CODE_GENDER 값이 XNA인 4건의 경우 오류 데이터일 가능성이 높기 때문에 학습 데이터에서 제거 검토 필요
In [101]:
def_ratio = list((agg_df['TARGET_ratio'].astype(str).values))
loan_type = list(agg_df[col_name+'_'].values)

labels = []
for (item1, item2) in zip(loan_type, def_ratio):
    labels.append(item1 + '(Default Ratio:' + item2 + '%)')
In [102]:
fig = px.pie(agg_df, values='CRatio', names=labels, title='Gender Component Ratio')
fig.show()
  • 대출 고객은 남성이 65.8%, 여성이 34.2%, 알수없음(XNA)가 0.0013%를 차지하고 있음
  • 부도율은 남성(10.14%)이 여성(7.0%)보다 3.14포인트 높게 나타남
In [103]:
col_name = 'FLAG_OWN_CAR'
agg_df = app_train.groupby([col_name]).agg({**default_agg}).reset_index()
agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
agg_df['TARGET_ratio'] = np.round(agg_df['TARGET_sum']/agg_df['TARGET_count'] * 100, 2)
agg_df['CRatio'] = np.round(agg_df['TARGET_count']/agg_df['TARGET_count'].sum() * 100, 4)
agg_df.head()
Out[103]:
FLAG_OWN_CAR_ TARGET_sum TARGET_count TARGET_ratio CRatio
0 N 17249 202924 8.50 65.9892
1 Y 7576 104587 7.24 34.0108
In [104]:
col_name = 'FLAG_OWN_REALTY'
agg_df = app_train.groupby([col_name]).agg({**default_agg}).reset_index()
agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
agg_df['TARGET_ratio'] = np.round(agg_df['TARGET_sum']/agg_df['TARGET_count'] * 100, 2)
agg_df['CRatio'] = np.round(agg_df['TARGET_count']/agg_df['TARGET_count'].sum() * 100, 4)
agg_df.head()
Out[104]:
FLAG_OWN_REALTY_ TARGET_sum TARGET_count TARGET_ratio CRatio
0 N 7842 94199 8.32 30.6327
1 Y 16983 213312 7.96 69.3673
  • FLAG_OWN_CAR, FLAG_OWN_REALTY 컬럼의 경우 각각 1.3포인트, 0.6포인트로 차 or 집의 보유 여부가 Default와는 큰 상관이 없음을 나타냄
In [105]:
col_name = 'CNT_CHILDREN'
agg_df = app_train.groupby([col_name]).agg({**default_agg}).reset_index()
agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
agg_df['TARGET_ratio'] = np.round(agg_df['TARGET_sum']/agg_df['TARGET_count'] * 100, 2)
agg_df['CRatio'] = np.round(agg_df['TARGET_count']/agg_df['TARGET_count'].sum() * 100, 4)
agg_df.head()
Out[105]:
CNT_CHILDREN_ TARGET_sum TARGET_count TARGET_ratio CRatio
0 0 16609 215371 7.71 70.0368
1 1 5454 61119 8.92 19.8754
2 2 2333 26749 8.72 8.6986
3 3 358 3717 9.63 1.2087
4 4 55 429 12.82 0.1395
In [106]:
def_ratio = list((agg_df['TARGET_ratio'].astype(str).values))
loan_type = list(agg_df[col_name+'_'].astype(str).values)

labels = []
for (item1, item2) in zip(loan_type, def_ratio):
    labels.append(item1 + '(Default Ratio:' + item2 + '%)')
In [107]:
fig = px.pie(agg_df, values='CRatio', names=labels, title='# of Children Component Ratio')
fig.show()
  • 자녀수가 3명 초과의 경우 1% 미만의 구성비를 가지고 있기 때문에 3명 이상은 하나의 카테고리로 분류 검토 필요
  • 자녀수가 증가함에 따라 부도율은 7.71%(0명)에서 9.63%(3명)로 1.9포인트 상승함
In [108]:
col_name = 'NAME_TYPE_SUITE'
agg_df = app_train.groupby([col_name]).agg({**default_agg}).reset_index()
agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
agg_df['TARGET_ratio'] = np.round(agg_df['TARGET_sum']/agg_df['TARGET_count'] * 100, 2)
agg_df['CRatio'] = np.round(agg_df['TARGET_count']/agg_df['TARGET_count'].sum() * 100, 4)
agg_df = agg_df.sort_values('TARGET_ratio')
agg_df.head()
Out[108]:
NAME_TYPE_SUITE_ TARGET_sum TARGET_count TARGET_ratio CRatio
0 Children 241 3267 7.38 1.0669
1 Family 3009 40149 7.49 13.1112
5 Spouse, partner 895 11370 7.87 3.7130
6 Unaccompanied 20337 248526 8.18 81.1596
2 Group of people 23 271 8.49 0.0885
In [109]:
def_ratio = list((agg_df['TARGET_ratio'].astype(str).values))
loan_type = list(agg_df[col_name+'_'].astype(str).values)

labels = []
for (item1, item2) in zip(loan_type, def_ratio):
    labels.append(item1 + '(Default Ratio:' + item2 + '%)')
In [110]:
fig = px.pie(agg_df, values='CRatio', names=labels, title='NAME_TYPE_SUITE Component Ratio')
fig.show()
  • 대출 신청시 가족이 같이 동행한(Family or Children) 고객의 경우 부도율이 7.49%, 7.38%로 다른 카테고리 대비 부도율이 낮음
In [111]:
col_name = 'NAME_INCOME_TYPE'
agg_df = app_train.groupby([col_name]).agg({**default_agg}).reset_index()
agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
agg_df['TARGET_ratio'] = np.round(agg_df['TARGET_sum']/agg_df['TARGET_count'] * 100, 2)
agg_df['CRatio'] = np.round(agg_df['TARGET_count']/agg_df['TARGET_count'].sum() * 100, 4)
agg_df = agg_df.sort_values('TARGET_ratio')
agg_df.head()
Out[111]:
NAME_INCOME_TYPE_ TARGET_sum TARGET_count TARGET_ratio CRatio
0 Businessman 0 10 0.00 0.0033
5 Student 0 18 0.00 0.0059
3 Pensioner 2982 55362 5.39 18.0033
4 State servant 1249 21703 5.75 7.0576
1 Commercial associate 5360 71617 7.48 23.2892
In [112]:
def_ratio = list((agg_df['TARGET_ratio'].astype(str).values))
loan_type = list(agg_df[col_name+'_'].astype(str).values)

labels = []
for (item1, item2) in zip(loan_type, def_ratio):
    labels.append(item1 + '(Default Ratio:' + item2 + '%)')
In [113]:
fig = px.pie(agg_df, values='CRatio', names=labels, title='%s Component Ratio'%col_name)
fig.show()
  • Working 그룹이 51.6%의 구성비로 가장 높은 구성비를 나타내지만, 부도율은 9.59%로 평균 부도율 8%대비 1.59포인트 높게 나타남
  • 소득이 안정적인 연금수령자(Pensioner), 공무원(State servant)는 부도율이 각 5.39%, 5.75%로 낮게 나타남
  • 소득이 불안정한 실업자(Unemployed), 출산휴가(Maternity leave)는 부도율이 36.36%, 40.00%로 매우 높게 나타남
In [114]:
col_name = 'NAME_EDUCATION_TYPE'
agg_df = app_train.groupby([col_name]).agg({**default_agg}).reset_index()
agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
agg_df['TARGET_ratio'] = np.round(agg_df['TARGET_sum']/agg_df['TARGET_count'] * 100, 2)
agg_df['CRatio'] = np.round(agg_df['TARGET_count']/agg_df['TARGET_count'].sum() * 100, 4)
agg_df = agg_df.sort_values('TARGET_ratio')
agg_df.head()
Out[114]:
NAME_EDUCATION_TYPE_ TARGET_sum TARGET_count TARGET_ratio CRatio
0 Academic degree 3 164 1.83 0.0533
1 Higher education 4009 74863 5.36 24.3448
2 Incomplete higher 872 10277 8.48 3.3420
4 Secondary / secondary special 19524 218391 8.94 71.0189
3 Lower secondary 417 3816 10.93 1.2409

3. Simple Feature Engineering

  • Application Score 개발에 많이 활용되는 Application 정보 및 Bureau 정보를 활용한 간단한 Feature Engineering 수행
In [7]:
"""
# Label encoding
""" 
def label_encoding(df, cat_cols):
    le = LabelEncoder()
    #cat_cols = [col for col in df.select_dtypes(include='category').columns]
    for col in cat_cols:
        df[col] = df[col].fillna('unknown')
        df[col+'_le'] = le.fit_transform(df[col])
    return df
In [40]:
"""
# Application Data Label Encoding
""" 
all_app_label = label_encoding(all_app, cat_cols)
all_app_label.head()
Out[40]:
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR DataCategory NAME_CONTRACT_TYPE_le CODE_GENDER_le FLAG_OWN_CAR_le FLAG_OWN_REALTY_le NAME_TYPE_SUITE_le NAME_INCOME_TYPE_le NAME_EDUCATION_TYPE_le NAME_FAMILY_STATUS_le NAME_HOUSING_TYPE_le OCCUPATION_TYPE_le WEEKDAY_APPR_PROCESS_START_le ORGANIZATION_TYPE_le FONDKAPREMONT_MODE_le HOUSETYPE_MODE_le WALLSMATERIAL_MODE_le EMERGENCYSTATE_MODE_le
0 100002 1.0 Cash loans M N Y 0 202500.0 406597.5 24700.5 351000.0 Unaccompanied Working Secondary / secondary special Single / not married House / apartment 0.018801 -9461 -637.0 -3648.0 -2120 NaN 1 1 0 1 1 0 Laborers 1.0 2 2 WEDNESDAY 10 0 0 0 0 0 0 Business Entity Type 3 0.083037 0.262949 0.139376 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833 0.1250 0.0369 0.0202 0.0190 0.0000 0.0000 0.0252 0.0383 0.9722 0.6341 0.0144 0.0000 0.0690 0.0833 0.1250 0.0377 0.022 0.0198 0.0 0.0 0.0250 0.0369 0.9722 0.6243 0.0144 0.00 0.0690 0.0833 0.1250 0.0375 0.0205 0.0193 0.0000 0.00 reg oper account block of flats 0.0149 Stone, brick No 2.0 2.0 2.0 2.0 -1134.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0 TRAIN 0 1 0 1 6 7 4 3 1 8 6 5 2 0 5 0
1 100003 0.0 Cash loans F N N 0 270000.0 1293502.5 35698.5 1129500.0 Family State servant Higher education Married House / apartment 0.003541 -16765 -1188.0 -1186.0 -291 NaN 1 1 0 1 1 0 Core staff 2.0 1 1 MONDAY 11 0 0 0 0 0 0 School 0.311267 0.622246 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 0.3333 0.0130 0.0773 0.0549 0.0039 0.0098 0.0924 0.0538 0.9851 0.8040 0.0497 0.0806 0.0345 0.2917 0.3333 0.0128 0.079 0.0554 0.0 0.0 0.0968 0.0529 0.9851 0.7987 0.0608 0.08 0.0345 0.2917 0.3333 0.0132 0.0787 0.0558 0.0039 0.01 reg oper account block of flats 0.0714 Block No 1.0 0.0 1.0 0.0 -828.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 TRAIN 0 0 0 0 1 4 1 1 1 3 1 39 2 0 0 0
2 100004 0.0 Revolving loans M Y Y 0 67500.0 135000.0 6750.0 135000.0 Unaccompanied Working Secondary / secondary special Single / not married House / apartment 0.010032 -19046 -225.0 -4260.0 -2531 26.0 1 1 1 1 1 0 Laborers 1.0 2 2 MONDAY 9 0 0 0 0 0 0 Government NaN 0.555912 0.729567 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN unknown unknown NaN unknown unknown 0.0 0.0 0.0 0.0 -815.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 TRAIN 1 1 1 1 6 7 4 3 1 8 1 11 4 3 7 2
3 100006 0.0 Cash loans F N Y 0 135000.0 312682.5 29686.5 297000.0 Unaccompanied Working Secondary / secondary special Civil marriage House / apartment 0.008019 -19005 -3039.0 -9833.0 -2437 NaN 1 1 0 1 0 0 Laborers 2.0 2 2 WEDNESDAY 17 0 0 0 0 0 0 Business Entity Type 3 NaN 0.650442 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN unknown unknown NaN unknown unknown 2.0 0.0 2.0 0.0 -617.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN TRAIN 0 0 0 1 6 7 4 0 1 8 6 5 4 3 7 2
4 100007 0.0 Cash loans M N Y 0 121500.0 513000.0 21865.5 513000.0 Unaccompanied Working Secondary / secondary special Single / not married House / apartment 0.028663 -19932 -3038.0 -4311.0 -3458 NaN 1 1 0 1 0 0 Core staff 1.0 2 2 THURSDAY 11 0 0 0 0 1 1 Religion NaN 0.322738 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN unknown unknown NaN unknown unknown 0.0 0.0 0.0 0.0 -1106.0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 TRAIN 0 1 0 1 6 7 4 3 1 3 4 37 4 3 7 2
In [41]:
print (all_app_label.shape)
(356255, 139)
In [42]:
# Category Features drop
all_app_label.drop(cat_cols, axis = 1, inplace=True)
print (all_app_label.shape)
(356255, 123)
  • 이전 대출 건수 변수 생성
    • Missing Value의 경우 대출 건수가 0건
In [45]:
bureau.head()
Out[45]:
SK_ID_CURR SK_ID_BUREAU CREDIT_ACTIVE CREDIT_CURRENCY DAYS_CREDIT CREDIT_DAY_OVERDUE DAYS_CREDIT_ENDDATE DAYS_ENDDATE_FACT AMT_CREDIT_MAX_OVERDUE CNT_CREDIT_PROLONG AMT_CREDIT_SUM AMT_CREDIT_SUM_DEBT AMT_CREDIT_SUM_LIMIT AMT_CREDIT_SUM_OVERDUE CREDIT_TYPE DAYS_CREDIT_UPDATE AMT_ANNUITY
0 215354 5714462 Closed currency 1 -497 0 -153.0 -153.0 NaN 0 91323.0 0.0 NaN 0.0 Consumer credit -131 NaN
1 215354 5714463 Active currency 1 -208 0 1075.0 NaN NaN 0 225000.0 171342.0 NaN 0.0 Credit card -20 NaN
2 215354 5714464 Active currency 1 -203 0 528.0 NaN NaN 0 464323.5 NaN NaN 0.0 Consumer credit -16 NaN
3 215354 5714465 Active currency 1 -203 0 NaN NaN NaN 0 90000.0 NaN NaN 0.0 Credit card -16 NaN
4 215354 5714466 Active currency 1 -629 0 1197.0 NaN 77674.5 0 2700000.0 NaN NaN 0.0 Consumer credit -21 NaN
In [43]:
"""
# 이전 대출 건수 변수 생성
"""
previous_loan_counts = bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count().rename(columns = {'SK_ID_BUREAU': 'previous_loan_counts'})
previous_loan_counts.head()
Out[43]:
SK_ID_CURR previous_loan_counts
0 100001 7
1 100002 8
2 100003 4
3 100004 2
4 100005 3
In [44]:
"""
# Bureau Data의 count, mean, max, min, sum, std 변수 생성
"""
bureau_agg = bureau.drop(columns = ['SK_ID_BUREAU']).groupby('SK_ID_CURR', as_index = False).agg(['count', 'mean', 'max', 'min', 'sum', 'std']).reset_index()
bureau_agg.head()
Out[44]:
SK_ID_CURR DAYS_CREDIT CREDIT_DAY_OVERDUE DAYS_CREDIT_ENDDATE DAYS_ENDDATE_FACT AMT_CREDIT_MAX_OVERDUE CNT_CREDIT_PROLONG AMT_CREDIT_SUM AMT_CREDIT_SUM_DEBT AMT_CREDIT_SUM_LIMIT AMT_CREDIT_SUM_OVERDUE DAYS_CREDIT_UPDATE AMT_ANNUITY
count mean max min sum std count mean max min sum std count mean max min sum std count mean max min sum std count mean max min sum std count mean max min sum std count mean max min sum std count mean max min sum std count mean max min sum std count mean max min sum std count mean max min sum std count mean max min sum std
0 100001 7 -735.000000 -49 -1572 -5145 489.942514 7 0.0 0 0 0 0.0 7 82.428571 1778.0 -1329.0 577.0 1032.859277 4 -825.500000 -544.0 -1328.0 -3302.0 369.078582 0 NaN NaN NaN 0.000 NaN 7 0.0 0 0 0 0.0 7 207623.571429 378000.0 85500.0 1453365.000 122544.544510 7 85240.928571 373239.0 0.0 596686.5 137485.631124 6 0.00000 0.000 0.0 0.000 0.0000 7 0.0 0.0 0.0 0.0 0.0 7 -93.142857 -6 -155 -652 77.204120 7 3545.357143 10822.5 0.0 24817.5 4800.607529
1 100002 8 -874.000000 -103 -1437 -6992 431.451040 8 0.0 0 0 0 0.0 6 -349.000000 780.0 -1072.0 -2094.0 767.490977 6 -697.500000 -36.0 -1185.0 -4185.0 515.992539 5 1681.029 5043.645 0.0 8405.145 2363.2469 8 0.0 0 0 0 0.0 8 108131.945625 450000.0 0.0 865055.565 146075.557435 5 49156.200000 245781.0 0.0 245781.0 109916.604716 4 7997.14125 31988.565 0.0 31988.565 15994.2825 8 0.0 0.0 0.0 0.0 0.0 8 -499.875000 -7 -1185 -3999 518.522472 7 0.000000 0.0 0.0 0.0 0.000000
2 100003 4 -1400.750000 -606 -2586 -5603 909.826128 4 0.0 0 0 0 0.0 4 -544.500000 1216.0 -2434.0 -2178.0 1492.770467 3 -1097.333333 -540.0 -2131.0 -3292.0 896.097279 4 0.000 0.000 0.0 0.000 0.0000 4 0.0 0 0 0 0.0 4 254350.125000 810000.0 22248.0 1017400.500 372269.465535 4 0.000000 0.0 0.0 0.0 0.000000 4 202500.00000 810000.000 0.0 810000.000 405000.0000 4 0.0 0.0 0.0 0.0 0.0 4 -816.000000 -43 -2131 -3264 908.053963 0 NaN NaN NaN 0.0 NaN
3 100004 2 -867.000000 -408 -1326 -1734 649.124025 2 0.0 0 0 0 0.0 2 -488.500000 -382.0 -595.0 -977.0 150.613744 2 -532.500000 -382.0 -683.0 -1065.0 212.839141 1 0.000 0.000 0.0 0.000 NaN 2 0.0 0 0 0 0.0 2 94518.900000 94537.8 94500.0 189037.800 26.728636 2 0.000000 0.0 0.0 0.0 0.000000 2 0.00000 0.000 0.0 0.000 0.0000 2 0.0 0.0 0.0 0.0 0.0 2 -532.000000 -382 -682 -1064 212.132034 0 NaN NaN NaN 0.0 NaN
4 100005 3 -190.666667 -62 -373 -572 162.297053 3 0.0 0 0 0 0.0 3 439.333333 1324.0 -128.0 1318.0 776.274007 1 -123.000000 -123.0 -123.0 -123.0 NaN 1 0.000 0.000 0.0 0.000 NaN 3 0.0 0 0 0 0.0 3 219042.000000 568800.0 29826.0 657126.000 303238.426806 3 189469.500000 543087.0 0.0 568408.5 306503.339003 3 0.00000 0.000 0.0 0.000 0.0000 3 0.0 0.0 0.0 0.0 0.0 3 -54.333333 -11 -121 -163 58.594653 3 1420.500000 4261.5 0.0 4261.5 2460.378172
In [46]:
columns = ['SK_ID_CURR']

"""
# Bureau 변수 Column Name 생성 Bureau Column name + count/mean/max/min/sum/std
"""
for var in bureau_agg.columns.levels[0]:
    if var != 'SK_ID_CURR':
        for stat in bureau_agg.columns.levels[1][:-1]:
            columns.append('bureau_%s_%s' % (var, stat))
In [47]:
# Column Name Change
bureau_agg.columns = columns
bureau_agg.head()
Out[47]:
SK_ID_CURR bureau_DAYS_CREDIT_count bureau_DAYS_CREDIT_mean bureau_DAYS_CREDIT_max bureau_DAYS_CREDIT_min bureau_DAYS_CREDIT_sum bureau_DAYS_CREDIT_std bureau_CREDIT_DAY_OVERDUE_count bureau_CREDIT_DAY_OVERDUE_mean bureau_CREDIT_DAY_OVERDUE_max bureau_CREDIT_DAY_OVERDUE_min bureau_CREDIT_DAY_OVERDUE_sum bureau_CREDIT_DAY_OVERDUE_std bureau_DAYS_CREDIT_ENDDATE_count bureau_DAYS_CREDIT_ENDDATE_mean bureau_DAYS_CREDIT_ENDDATE_max bureau_DAYS_CREDIT_ENDDATE_min bureau_DAYS_CREDIT_ENDDATE_sum bureau_DAYS_CREDIT_ENDDATE_std bureau_DAYS_ENDDATE_FACT_count bureau_DAYS_ENDDATE_FACT_mean bureau_DAYS_ENDDATE_FACT_max bureau_DAYS_ENDDATE_FACT_min bureau_DAYS_ENDDATE_FACT_sum bureau_DAYS_ENDDATE_FACT_std bureau_AMT_CREDIT_MAX_OVERDUE_count bureau_AMT_CREDIT_MAX_OVERDUE_mean bureau_AMT_CREDIT_MAX_OVERDUE_max bureau_AMT_CREDIT_MAX_OVERDUE_min bureau_AMT_CREDIT_MAX_OVERDUE_sum bureau_AMT_CREDIT_MAX_OVERDUE_std bureau_CNT_CREDIT_PROLONG_count bureau_CNT_CREDIT_PROLONG_mean bureau_CNT_CREDIT_PROLONG_max bureau_CNT_CREDIT_PROLONG_min bureau_CNT_CREDIT_PROLONG_sum bureau_CNT_CREDIT_PROLONG_std bureau_AMT_CREDIT_SUM_count bureau_AMT_CREDIT_SUM_mean bureau_AMT_CREDIT_SUM_max bureau_AMT_CREDIT_SUM_min bureau_AMT_CREDIT_SUM_sum bureau_AMT_CREDIT_SUM_std bureau_AMT_CREDIT_SUM_DEBT_count bureau_AMT_CREDIT_SUM_DEBT_mean bureau_AMT_CREDIT_SUM_DEBT_max bureau_AMT_CREDIT_SUM_DEBT_min bureau_AMT_CREDIT_SUM_DEBT_sum bureau_AMT_CREDIT_SUM_DEBT_std bureau_AMT_CREDIT_SUM_LIMIT_count bureau_AMT_CREDIT_SUM_LIMIT_mean bureau_AMT_CREDIT_SUM_LIMIT_max bureau_AMT_CREDIT_SUM_LIMIT_min bureau_AMT_CREDIT_SUM_LIMIT_sum bureau_AMT_CREDIT_SUM_LIMIT_std bureau_AMT_CREDIT_SUM_OVERDUE_count bureau_AMT_CREDIT_SUM_OVERDUE_mean bureau_AMT_CREDIT_SUM_OVERDUE_max bureau_AMT_CREDIT_SUM_OVERDUE_min bureau_AMT_CREDIT_SUM_OVERDUE_sum bureau_AMT_CREDIT_SUM_OVERDUE_std bureau_DAYS_CREDIT_UPDATE_count bureau_DAYS_CREDIT_UPDATE_mean bureau_DAYS_CREDIT_UPDATE_max bureau_DAYS_CREDIT_UPDATE_min bureau_DAYS_CREDIT_UPDATE_sum bureau_DAYS_CREDIT_UPDATE_std bureau_AMT_ANNUITY_count bureau_AMT_ANNUITY_mean bureau_AMT_ANNUITY_max bureau_AMT_ANNUITY_min bureau_AMT_ANNUITY_sum bureau_AMT_ANNUITY_std
0 100001 7 -735.000000 -49 -1572 -5145 489.942514 7 0.0 0 0 0 0.0 7 82.428571 1778.0 -1329.0 577.0 1032.859277 4 -825.500000 -544.0 -1328.0 -3302.0 369.078582 0 NaN NaN NaN 0.000 NaN 7 0.0 0 0 0 0.0 7 207623.571429 378000.0 85500.0 1453365.000 122544.544510 7 85240.928571 373239.0 0.0 596686.5 137485.631124 6 0.00000 0.000 0.0 0.000 0.0000 7 0.0 0.0 0.0 0.0 0.0 7 -93.142857 -6 -155 -652 77.204120 7 3545.357143 10822.5 0.0 24817.5 4800.607529
1 100002 8 -874.000000 -103 -1437 -6992 431.451040 8 0.0 0 0 0 0.0 6 -349.000000 780.0 -1072.0 -2094.0 767.490977 6 -697.500000 -36.0 -1185.0 -4185.0 515.992539 5 1681.029 5043.645 0.0 8405.145 2363.2469 8 0.0 0 0 0 0.0 8 108131.945625 450000.0 0.0 865055.565 146075.557435 5 49156.200000 245781.0 0.0 245781.0 109916.604716 4 7997.14125 31988.565 0.0 31988.565 15994.2825 8 0.0 0.0 0.0 0.0 0.0 8 -499.875000 -7 -1185 -3999 518.522472 7 0.000000 0.0 0.0 0.0 0.000000
2 100003 4 -1400.750000 -606 -2586 -5603 909.826128 4 0.0 0 0 0 0.0 4 -544.500000 1216.0 -2434.0 -2178.0 1492.770467 3 -1097.333333 -540.0 -2131.0 -3292.0 896.097279 4 0.000 0.000 0.0 0.000 0.0000 4 0.0 0 0 0 0.0 4 254350.125000 810000.0 22248.0 1017400.500 372269.465535 4 0.000000 0.0 0.0 0.0 0.000000 4 202500.00000 810000.000 0.0 810000.000 405000.0000 4 0.0 0.0 0.0 0.0 0.0 4 -816.000000 -43 -2131 -3264 908.053963 0 NaN NaN NaN 0.0 NaN
3 100004 2 -867.000000 -408 -1326 -1734 649.124025 2 0.0 0 0 0 0.0 2 -488.500000 -382.0 -595.0 -977.0 150.613744 2 -532.500000 -382.0 -683.0 -1065.0 212.839141 1 0.000 0.000 0.0 0.000 NaN 2 0.0 0 0 0 0.0 2 94518.900000 94537.8 94500.0 189037.800 26.728636 2 0.000000 0.0 0.0 0.0 0.000000 2 0.00000 0.000 0.0 0.000 0.0000 2 0.0 0.0 0.0 0.0 0.0 2 -532.000000 -382 -682 -1064 212.132034 0 NaN NaN NaN 0.0 NaN
4 100005 3 -190.666667 -62 -373 -572 162.297053 3 0.0 0 0 0 0.0 3 439.333333 1324.0 -128.0 1318.0 776.274007 1 -123.000000 -123.0 -123.0 -123.0 NaN 1 0.000 0.000 0.0 0.000 NaN 3 0.0 0 0 0 0.0 3 219042.000000 568800.0 29826.0 657126.000 303238.426806 3 189469.500000 543087.0 0.0 568408.5 306503.339003 3 0.00000 0.000 0.0 0.000 0.0000 3 0.0 0.0 0.0 0.0 0.0 3 -54.333333 -11 -121 -163 58.594653 3 1420.500000 4261.5 0.0 4261.5 2460.378172
In [48]:
# Feature Engineering Result Merge
all_df = all_app_label.merge(previous_loan_counts, on = 'SK_ID_CURR', how = 'left')
all_df = all_df.merge(bureau_agg, on = 'SK_ID_CURR', how = 'left')
all_df.head()
Out[48]:
SK_ID_CURR TARGET CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI TOTALAREA_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR DataCategory NAME_CONTRACT_TYPE_le CODE_GENDER_le FLAG_OWN_CAR_le FLAG_OWN_REALTY_le NAME_TYPE_SUITE_le NAME_INCOME_TYPE_le NAME_EDUCATION_TYPE_le NAME_FAMILY_STATUS_le NAME_HOUSING_TYPE_le OCCUPATION_TYPE_le WEEKDAY_APPR_PROCESS_START_le ORGANIZATION_TYPE_le FONDKAPREMONT_MODE_le HOUSETYPE_MODE_le WALLSMATERIAL_MODE_le EMERGENCYSTATE_MODE_le previous_loan_counts bureau_DAYS_CREDIT_count bureau_DAYS_CREDIT_mean bureau_DAYS_CREDIT_max bureau_DAYS_CREDIT_min bureau_DAYS_CREDIT_sum bureau_DAYS_CREDIT_std bureau_CREDIT_DAY_OVERDUE_count bureau_CREDIT_DAY_OVERDUE_mean bureau_CREDIT_DAY_OVERDUE_max bureau_CREDIT_DAY_OVERDUE_min bureau_CREDIT_DAY_OVERDUE_sum bureau_CREDIT_DAY_OVERDUE_std bureau_DAYS_CREDIT_ENDDATE_count bureau_DAYS_CREDIT_ENDDATE_mean bureau_DAYS_CREDIT_ENDDATE_max bureau_DAYS_CREDIT_ENDDATE_min bureau_DAYS_CREDIT_ENDDATE_sum bureau_DAYS_CREDIT_ENDDATE_std bureau_DAYS_ENDDATE_FACT_count bureau_DAYS_ENDDATE_FACT_mean bureau_DAYS_ENDDATE_FACT_max bureau_DAYS_ENDDATE_FACT_min bureau_DAYS_ENDDATE_FACT_sum bureau_DAYS_ENDDATE_FACT_std bureau_AMT_CREDIT_MAX_OVERDUE_count bureau_AMT_CREDIT_MAX_OVERDUE_mean bureau_AMT_CREDIT_MAX_OVERDUE_max bureau_AMT_CREDIT_MAX_OVERDUE_min bureau_AMT_CREDIT_MAX_OVERDUE_sum bureau_AMT_CREDIT_MAX_OVERDUE_std bureau_CNT_CREDIT_PROLONG_count bureau_CNT_CREDIT_PROLONG_mean bureau_CNT_CREDIT_PROLONG_max bureau_CNT_CREDIT_PROLONG_min bureau_CNT_CREDIT_PROLONG_sum bureau_CNT_CREDIT_PROLONG_std bureau_AMT_CREDIT_SUM_count bureau_AMT_CREDIT_SUM_mean bureau_AMT_CREDIT_SUM_max bureau_AMT_CREDIT_SUM_min bureau_AMT_CREDIT_SUM_sum bureau_AMT_CREDIT_SUM_std bureau_AMT_CREDIT_SUM_DEBT_count bureau_AMT_CREDIT_SUM_DEBT_mean bureau_AMT_CREDIT_SUM_DEBT_max bureau_AMT_CREDIT_SUM_DEBT_min bureau_AMT_CREDIT_SUM_DEBT_sum bureau_AMT_CREDIT_SUM_DEBT_std bureau_AMT_CREDIT_SUM_LIMIT_count bureau_AMT_CREDIT_SUM_LIMIT_mean bureau_AMT_CREDIT_SUM_LIMIT_max bureau_AMT_CREDIT_SUM_LIMIT_min bureau_AMT_CREDIT_SUM_LIMIT_sum bureau_AMT_CREDIT_SUM_LIMIT_std bureau_AMT_CREDIT_SUM_OVERDUE_count bureau_AMT_CREDIT_SUM_OVERDUE_mean bureau_AMT_CREDIT_SUM_OVERDUE_max bureau_AMT_CREDIT_SUM_OVERDUE_min bureau_AMT_CREDIT_SUM_OVERDUE_sum bureau_AMT_CREDIT_SUM_OVERDUE_std bureau_DAYS_CREDIT_UPDATE_count bureau_DAYS_CREDIT_UPDATE_mean bureau_DAYS_CREDIT_UPDATE_max bureau_DAYS_CREDIT_UPDATE_min bureau_DAYS_CREDIT_UPDATE_sum bureau_DAYS_CREDIT_UPDATE_std bureau_AMT_ANNUITY_count bureau_AMT_ANNUITY_mean bureau_AMT_ANNUITY_max bureau_AMT_ANNUITY_min bureau_AMT_ANNUITY_sum bureau_AMT_ANNUITY_std
0 100002 1.0 0 202500.0 406597.5 24700.5 351000.0 0.018801 -9461 -637.0 -3648.0 -2120 NaN 1 1 0 1 1 0 1.0 2 2 10 0 0 0 0 0 0 0.083037 0.262949 0.139376 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833 0.1250 0.0369 0.0202 0.0190 0.0000 0.0000 0.0252 0.0383 0.9722 0.6341 0.0144 0.0000 0.0690 0.0833 0.1250 0.0377 0.022 0.0198 0.0 0.0 0.0250 0.0369 0.9722 0.6243 0.0144 0.00 0.0690 0.0833 0.1250 0.0375 0.0205 0.0193 0.0000 0.00 0.0149 2.0 2.0 2.0 2.0 -1134.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0 TRAIN 0 1 0 1 6 7 4 3 1 8 6 5 2 0 5 0 8.0 8.0 -874.00 -103.0 -1437.0 -6992.0 431.451040 8.0 0.0 0.0 0.0 0.0 0.0 6.0 -349.0 780.0 -1072.0 -2094.0 767.490977 6.0 -697.500000 -36.0 -1185.0 -4185.0 515.992539 5.0 1681.029 5043.645 0.0 8405.145 2363.2469 8.0 0.0 0.0 0.0 0.0 0.0 8.0 108131.945625 450000.0 0.0 865055.565 146075.557435 5.0 49156.2 245781.0 0.0 245781.0 109916.604716 4.0 7997.14125 31988.565 0.0 31988.565 15994.2825 8.0 0.0 0.0 0.0 0.0 0.0 8.0 -499.875 -7.0 -1185.0 -3999.0 518.522472 7.0 0.0 0.0 0.0 0.0 0.0
1 100003 0.0 0 270000.0 1293502.5 35698.5 1129500.0 0.003541 -16765 -1188.0 -1186.0 -291 NaN 1 1 0 1 1 0 2.0 1 1 11 0 0 0 0 0 0 0.311267 0.622246 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 0.3333 0.0130 0.0773 0.0549 0.0039 0.0098 0.0924 0.0538 0.9851 0.8040 0.0497 0.0806 0.0345 0.2917 0.3333 0.0128 0.079 0.0554 0.0 0.0 0.0968 0.0529 0.9851 0.7987 0.0608 0.08 0.0345 0.2917 0.3333 0.0132 0.0787 0.0558 0.0039 0.01 0.0714 1.0 0.0 1.0 0.0 -828.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 TRAIN 0 0 0 0 1 4 1 1 1 3 1 39 2 0 0 0 4.0 4.0 -1400.75 -606.0 -2586.0 -5603.0 909.826128 4.0 0.0 0.0 0.0 0.0 0.0 4.0 -544.5 1216.0 -2434.0 -2178.0 1492.770467 3.0 -1097.333333 -540.0 -2131.0 -3292.0 896.097279 4.0 0.000 0.000 0.0 0.000 0.0000 4.0 0.0 0.0 0.0 0.0 0.0 4.0 254350.125000 810000.0 22248.0 1017400.500 372269.465535 4.0 0.0 0.0 0.0 0.0 0.000000 4.0 202500.00000 810000.000 0.0 810000.000 405000.0000 4.0 0.0 0.0 0.0 0.0 0.0 4.0 -816.000 -43.0 -2131.0 -3264.0 908.053963 0.0 NaN NaN NaN 0.0 NaN
2 100004 0.0 0 67500.0 135000.0 6750.0 135000.0 0.010032 -19046 -225.0 -4260.0 -2531 26.0 1 1 1 1 1 0 1.0 2 2 9 0 0 0 0 0 0 NaN 0.555912 0.729567 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -815.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 TRAIN 1 1 1 1 6 7 4 3 1 8 1 11 4 3 7 2 2.0 2.0 -867.00 -408.0 -1326.0 -1734.0 649.124025 2.0 0.0 0.0 0.0 0.0 0.0 2.0 -488.5 -382.0 -595.0 -977.0 150.613744 2.0 -532.500000 -382.0 -683.0 -1065.0 212.839141 1.0 0.000 0.000 0.0 0.000 NaN 2.0 0.0 0.0 0.0 0.0 0.0 2.0 94518.900000 94537.8 94500.0 189037.800 26.728636 2.0 0.0 0.0 0.0 0.0 0.000000 2.0 0.00000 0.000 0.0 0.000 0.0000 2.0 0.0 0.0 0.0 0.0 0.0 2.0 -532.000 -382.0 -682.0 -1064.0 212.132034 0.0 NaN NaN NaN 0.0 NaN
3 100006 0.0 0 135000.0 312682.5 29686.5 297000.0 0.008019 -19005 -3039.0 -9833.0 -2437 NaN 1 1 0 1 0 0 2.0 2 2 17 0 0 0 0 0 0 NaN 0.650442 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 0.0 2.0 0.0 -617.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN TRAIN 0 0 0 1 6 7 4 0 1 8 6 5 4 3 7 2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 100007 0.0 0 121500.0 513000.0 21865.5 513000.0 0.028663 -19932 -3038.0 -4311.0 -3458 NaN 1 1 0 1 0 0 1.0 2 2 11 0 0 0 0 1 1 NaN 0.322738 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -1106.0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 TRAIN 0 1 0 1 6 7 4 3 1 3 4 37 4 3 7 2 1.0 1.0 -1149.00 -1149.0 -1149.0 -1149.0 NaN 1.0 0.0 0.0 0.0 0.0 NaN 1.0 -783.0 -783.0 -783.0 -783.0 NaN 1.0 -783.000000 -783.0 -783.0 -783.0 NaN 1.0 0.000 0.000 0.0 0.000 NaN 1.0 0.0 0.0 0.0 0.0 NaN 1.0 146250.000000 146250.0 146250.0 146250.000 NaN 1.0 0.0 0.0 0.0 0.0 NaN 1.0 0.00000 0.000 0.0 0.000 NaN 1.0 0.0 0.0 0.0 0.0 NaN 1.0 -783.000 -783.0 -783.0 -783.0 NaN 0.0 NaN NaN NaN 0.0 NaN

4.Feature Selection

  • ML 알고리즘 기반 CSS 모형 개발시 변수 선택 방법
    • Recursive Feature Elimination(RFE)
      • 모형 성능에 영향을 적게주는 변수를 제거하면서 선택하는 방법
    • Embeded Logic
      • RandomForest, Xgboost, Lightgbm, Catboost 알고리즘의 Feature Importance 활용
    • Filter 방법
      • correlation, chi2 등의 통계값을 기준으로 필터링하여 선택하는 방법
  • Score Card 기반 CSS 모형 개발시 변수 선택 방법

4.1. RFE Function을 사용하여 변수 선택

  • 다중공선성을 고려하여 변수 선택(LASSO)
    • LASSO Normalization이 필요함
    • Missing Value 처리 필요함
In [49]:
"""
# 전체 Feature List
"""
feats = [col for col in all_df.columns if col not in ['SK_ID_CURR', 'TARGET', 'DataCategory']]
print (len(feats))
193
In [50]:
"""
# Missing Value를 mean값으로 변환
"""
all_df.fillna(all_df.mean(), inplace=True)
In [51]:
"""
# Data Split & Normalization
"""
X_train = all_df[all_df['DataCategory']=='TRAIN'][feats] 
X_test = all_df[all_df['DataCategory']=='TEST'][feats]

y_train = all_df[all_df['DataCategory']=='TRAIN']['TARGET'] 
y_test = all_df[all_df['DataCategory']=='TEST']['TARGET']

# Normalization
scaler = StandardScaler()
scaler.fit(all_df[feats])

X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)
print (len(X_train_norm), len(X_test_norm))
307511 48744
In [53]:
rfe_selector = RFE(estimator=Lasso(alpha=0.01, max_iter=3000), n_features_to_select=100, step=10, verbose=5)
rfe_selector.fit(X_train_norm, y_train)
Fitting estimator with 193 features.
Fitting estimator with 183 features.
Fitting estimator with 173 features.
Fitting estimator with 163 features.
Fitting estimator with 153 features.
Fitting estimator with 143 features.
Fitting estimator with 133 features.
Fitting estimator with 123 features.
Fitting estimator with 113 features.
Fitting estimator with 103 features.
Out[53]:
RFE(estimator=Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=3000,
                    normalize=False, positive=False, precompute=False,
                    random_state=None, selection='cyclic', tol=0.0001,
                    warm_start=False),
    n_features_to_select=100, step=10, verbose=5)
In [54]:
rfe_support = rfe_selector.get_support()
rfe_feature = X_train.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'RFE Lasso selected features')
100 RFE Lasso selected features
In [58]:
print ('-'*120)
print (rfe_feature)
------------------------------------------------------------------------------------------------------------------------
['DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'TOTALAREA_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'CODE_GENDER_le', 'NAME_EDUCATION_TYPE_le', 'bureau_AMT_CREDIT_SUM_DEBT_count', 'bureau_AMT_CREDIT_SUM_DEBT_min', 'bureau_AMT_CREDIT_SUM_DEBT_sum', 'bureau_AMT_CREDIT_SUM_DEBT_std', 'bureau_AMT_CREDIT_SUM_LIMIT_count', 'bureau_AMT_CREDIT_SUM_LIMIT_mean', 'bureau_AMT_CREDIT_SUM_LIMIT_max', 'bureau_AMT_CREDIT_SUM_LIMIT_min', 'bureau_AMT_CREDIT_SUM_LIMIT_sum', 'bureau_AMT_CREDIT_SUM_LIMIT_std', 'bureau_AMT_CREDIT_SUM_OVERDUE_count', 'bureau_AMT_CREDIT_SUM_OVERDUE_mean', 'bureau_AMT_CREDIT_SUM_OVERDUE_max', 'bureau_AMT_CREDIT_SUM_OVERDUE_min', 'bureau_AMT_CREDIT_SUM_OVERDUE_sum', 'bureau_AMT_CREDIT_SUM_OVERDUE_std', 'bureau_DAYS_CREDIT_UPDATE_count', 'bureau_DAYS_CREDIT_UPDATE_mean', 'bureau_DAYS_CREDIT_UPDATE_max', 'bureau_DAYS_CREDIT_UPDATE_min', 'bureau_DAYS_CREDIT_UPDATE_sum', 'bureau_DAYS_CREDIT_UPDATE_std', 'bureau_AMT_ANNUITY_count', 'bureau_AMT_ANNUITY_mean', 'bureau_AMT_ANNUITY_max', 'bureau_AMT_ANNUITY_min', 'bureau_AMT_ANNUITY_sum', 'bureau_AMT_ANNUITY_std']

4.2. Information Value(Weight of evidence)를 활용한 변수 선택

  • Less than 0.02 : Not useful for prediction
  • 0.02 to 0.1 : Weak predictive Power
  • 0.1 to 0.3 : Medium predictive Power
  • 0.3 to 0.5 : Strong predictive Power
  • 0.5 over : Suspicious Predictive Power
In [62]:
"""
# Weight of evidence를 활용한 Binning
"""
bins = sc.woebin(all_df[all_df['DataCategory']=='TRAIN'], y="TARGET")
[INFO] creating woe binning ...
Binning on 307511 rows and 195 columns in 00:05:13
In [65]:
%%time
"""
# Information Value Check
"""
info_value = sc.iv(all_df[all_df['DataCategory']=='TRAIN'], y="TARGET", x=rfe_feature)
Wall time: 14min 22s
In [76]:
iv_feats = list(info_value[info_value['info_value']>=0.1]['variable'])
print (len(iv_feats), iv_feats)
45 ['bureau_AMT_CREDIT_SUM_LIMIT_mean', 'bureau_AMT_CREDIT_SUM_LIMIT_sum', 'bureau_AMT_CREDIT_SUM_LIMIT_std', 'bureau_AMT_CREDIT_SUM_LIMIT_max', 'bureau_AMT_ANNUITY_std', 'bureau_AMT_ANNUITY_mean', 'bureau_AMT_CREDIT_SUM_DEBT_min', 'bureau_AMT_ANNUITY_sum', 'EXT_SOURCE_1', 'bureau_AMT_CREDIT_SUM_DEBT_std', 'bureau_DAYS_CREDIT_UPDATE_mean', 'bureau_AMT_CREDIT_SUM_DEBT_sum', 'bureau_AMT_ANNUITY_max', 'bureau_DAYS_CREDIT_UPDATE_std', 'bureau_AMT_ANNUITY_min', 'bureau_DAYS_CREDIT_UPDATE_sum', 'EXT_SOURCE_2', 'DAYS_REGISTRATION', 'DAYS_EMPLOYED', 'EXT_SOURCE_3', 'DAYS_ID_PUBLISH', 'bureau_AMT_CREDIT_SUM_LIMIT_min', 'bureau_DAYS_CREDIT_UPDATE_min', 'DAYS_LAST_PHONE_CHANGE', 'LIVINGAREA_MODE', 'TOTALAREA_MODE', 'LIVINGAREA_MEDI', 'LIVINGAREA_AVG', 'bureau_AMT_CREDIT_SUM_OVERDUE_std', 'BASEMENTAREA_MODE', 'BASEMENTAREA_AVG', 'BASEMENTAREA_MEDI', 'NONLIVINGAREA_MEDI', 'LANDAREA_MEDI', 'NONLIVINGAREA_AVG', 'bureau_AMT_CREDIT_SUM_OVERDUE_mean', 'LANDAREA_AVG', 'LANDAREA_MODE', 'NONLIVINGAREA_MODE', 'COMMONAREA_MEDI', 'bureau_DAYS_CREDIT_UPDATE_max', 'COMMONAREA_MODE', 'COMMONAREA_AVG', 'APARTMENTS_AVG', 'bureau_AMT_CREDIT_SUM_OVERDUE_sum']
In [67]:
"""
# Train/Test Data를 WOE Bin 값으로 변경
"""
train_woe = sc.woebin_ply(X_train[iv_feats], bins)
test_woe = sc.woebin_ply(X_test[iv_feats], bins)
[INFO] converting into woe values ...
Woe transformating on 307511 rows and 45 columns in 00:00:39
[INFO] converting into woe values ...
In [68]:
%%time
"""
# LogisticRegression 모형 학습
"""
lr = LogisticRegression(penalty='l1', C=0.9, solver='saga', n_jobs=-1)
lr.fit(train_woe, y_train)

# predicted proability
train_pred = lr.predict_proba(train_woe)[:,1]
test_pred = lr.predict_proba(test_woe)[:,1]

print (roc_auc_score(y_train, train_pred))
0.7268896397875768
In [77]:
"""
# 학습한 LogisticRegression 모형과 WOE binning 결과를 기준으로 Scorecard 생성
"""
card = sc.scorecard(bins, lr, train_woe.columns)
In [70]:
"""
# Scorecard Sample
"""
card['bureau_AMT_CREDIT_SUM_LIMIT_sum']
Out[70]:
variable bin points
76 bureau_AMT_CREDIT_SUM_LIMIT_sum [-inf,2000.0) -2.0
77 bureau_AMT_CREDIT_SUM_LIMIT_sum [2000.0,64000.0) 6.0
78 bureau_AMT_CREDIT_SUM_LIMIT_sum [64000.0,inf) -4.0
In [71]:
%%time
"""
# Scorecard 기준 Score 계산
"""
train_score = sc.scorecard_ply(X_train[iv_feats], card, print_step=1)
test_score = sc.scorecard_ply(X_test[iv_feats], card, print_step=1)
Wall time: 44.9 s
In [72]:
"""
# 예측 등급의 안정성을 체크하기 위한 PSI(Population Stability Index) 지표 체크
"""
psi_chk = sc.perf_psi(
    score = {'train':train_score, 'test':test_score},
    label = {'train':y_train, 'test':y_test},
    x_tick_break = 30, #등급별 점수 Range
    return_distr_dat=True,
    show_plot=False
)
In [73]:
"""
# PSI 지표 결과
- PSI < 0.1 : 안정
- 0.1 <= PSI < 0.25 : 다소 불안정
- PSI >= 0.25 : 불안성
"""
psi_chk['psi']
Out[73]:
variable PSI
0 score 0.000271
In [74]:
psi_chk
Out[74]:
{'psi':   variable       PSI
 0    score  0.000271,
 'pic': {},
 'dat': {'score':           bin    N            badprob
  ae            test    train     train
  0   [360,450)  0.0   5222.0  0.340866
  1   [450,480)  0.0  10826.0  0.250785
  2   [480,510)  0.0  22391.0  0.181189
  3   [510,540)  0.0  36988.0  0.129853
  4   [540,570)  0.0  51377.0  0.086654
  5   [570,600)  0.0  56022.0  0.058656
  6   [600,630)  0.0  51882.0  0.040284
  7   [630,660)  0.0  38588.0  0.027029
  8   [660,810)  0.0  34215.0  0.017507}}

5. Hyperparameter Tunning

  • Grid Search의 경우 Parameter 탐색 시간이 오래걸리기 때문에 Random Search로 구현
In [78]:
N_FOLDS=5

def objective(params, iteration):
    """
    # Objective Function
    """
    if 'num_boost_round' in params.keys():
        del params['num_boost_round']

    cv_results = lgb.cv(params, dtrain, num_boost_round = 1000, nfold=N_FOLDS, early_stopping_rounds=50, metrics=['auc'], seed=2020)
    score = cv_results['auc-mean'][-1]
    estimators = len(cv_results['auc-mean'])
    params['num_boost_round'] = estimators
    
    return [score, params, iteration]
In [81]:
random.seed(2020)
MAX_EVALS = 100

def random_search(param_grid, max_evals=MAX_EVALS):
    """
    # Random Search for Hyperparameter optimization
    """
    results=pd.DataFrame(columns=['score', 'params', 'iteration'], index=list(range(max_evals)))
    
    for i in range(max_evals):
        # Choose Random Params in param_grid
        hyperparams = {k:random.sample(v,1)[0] for k,v in param_grid.items()}
        hyperparams['subsample'] = 1.0 if hyperparams['boosting_type']=='goss' else hyperparams['subsample']
        
        eval_results=objective(hyperparams, i)
        results.loc[i,:]=eval_results
    
    results.sort_values('score', ascending=False, inplace=True)
    results.reset_index(inplace=True)
    return results
In [82]:
"""
# Lightgbm Hyperparameter Grid
"""
lgb_param_grid = {
    'objective ' : ['binary'],
    'boosting_type' : ['gbdt'],
    'num_leaves' : list(range(20, 150)),
    'learning_rate' : list(np.logspace(np.log10(0.005), np.log10(0.3), base=10, num=100)),
    'subsample_for_bin' : list(range(2000, 10000, 2000)),
    'min_child_samples' : list(range(20, 500, 5)),
    'reg_alpha' : list(np.linspace(0, 1)),
    'reg_lambda' : list(np.linspace(0, 1)),
    'colsample_bytree' : list(np.linspace(0.6, 0.9, 10)),
    'subsample' : list(np.linspace(0.5, 0.9, 10)),
    'drop_rate': list(np.linspace(0.01, 0.05, 10))
}
In [83]:
# Data Type Change
dtrain = lgb.Dataset(data = X_train[rfe_feature], label=y_train)
In [84]:
%%time
print ('Running Hyperparameter Search...')
random_results = random_search(lgb_param_grid, 15)
Running Hyperparameter Search...
Wall time: 33min 45s
In [85]:
random_results.head()
Out[85]:
index score params iteration
0 6 0.748179 {'objective ': 'binary', 'boosting_type': 'gbd... 6
1 10 0.748057 {'objective ': 'binary', 'boosting_type': 'gbd... 10
2 7 0.74788 {'objective ': 'binary', 'boosting_type': 'gbd... 7
3 2 0.747488 {'objective ': 'binary', 'boosting_type': 'gbd... 2
4 9 0.74736 {'objective ': 'binary', 'boosting_type': 'gbd... 9
In [86]:
# Hyperparameter Setting
sel_params = random_results['params'].values
sel_param = sel_params[0]
print (sel_param)
{'objective ': 'binary', 'boosting_type': 'gbdt', 'num_leaves': 55, 'learning_rate': 0.006678791265481649, 'subsample_for_bin': 6000, 'min_child_samples': 175, 'reg_alpha': 0.7142857142857142, 'reg_lambda': 0.5918367346938775, 'colsample_bytree': 0.6333333333333333, 'subsample': 0.7666666666666666, 'drop_rate': 0.02333333333333333, 'num_boost_round': 1000}
In [87]:
# Model Train
gbdt = lgb.train(sel_param, dtrain)
pred_train = gbdt.predict(X_train[rfe_feature])
pred_test = gbdt.predict(X_test[rfe_feature])

Feature Importance Plot

In [88]:
feat_importance = pd.DataFrame()
feat_importance['feature'] = rfe_feature
feat_importance['importance'] = gbdt.feature_importance()
feat_importance = feat_importance[feat_importance['importance']>0]
feat_importance.shape
Out[88]:
(97, 2)
In [89]:
feat_importance
Out[89]:
feature importance
0 DAYS_EMPLOYED 2716
1 DAYS_REGISTRATION 2095
2 DAYS_ID_PUBLISH 2313
3 OWN_CAR_AGE 1430
5 FLAG_EMP_PHONE 205
... ... ...
95 bureau_AMT_ANNUITY_mean 340
96 bureau_AMT_ANNUITY_max 354
97 bureau_AMT_ANNUITY_min 303
98 bureau_AMT_ANNUITY_sum 456
99 bureau_AMT_ANNUITY_std 274

97 rows × 2 columns

In [90]:
plt.figure(figsize=(8, 10))
sns.barplot(x="importance", y="feature", data=feat_importance.sort_values(by="importance", ascending=False))
plt.title('LightGBM Features')
plt.yticks(size=8)
plt.tight_layout()
In [91]:
print ('Train ROC Score : %s'%roc_auc_score(y_train, pred_train))
Train ROC Score : 0.7750770143058462
In [92]:
submission = pd.read_csv('./input/sample_submission.csv')
submission['TARGET'] = pred_test
submission.to_csv('./input/submission.csv', index=False)
In [ ]: